import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
# Load the raw Lightcast job-postings export.
df = pd.read_csv("data/lightcast_job_postings.csv")

# Columns retained for the exploratory analysis.
columns_to_keep = [
    'COMPANY', 'LOCATION', 'POSTED', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS_NAME',
    'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'TITLE', 'SKILLS',
    'SPECIALIZED_SKILLS', 'CERTIFICATIONS', 'COMMON_SKILLS', 'SOFTWARE_SKILLS',
    'SOC_2021_4_NAME', 'NAICS_2022_6', 'NAICS2_NAME', 'REMOTE_TYPE_NAME',
    'SALARY', 'TITLE_NAME', 'SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME', 'BODY',
]

# Take an explicit copy so that later in-place modifications of `eda_data`
# act on an independent frame instead of a selection of `df` (avoids
# SettingWithCopyWarning and keeps `df` untouched).
eda_data = df[columns_to_keep].copy()

# Cleaning
# Missing-value correlation heatmap (lower triangle only).

# Indicator frame: 1 where a cell is null, 0 otherwise. Correlating its
# columns measures how strongly missingness in one column tracks another.
missing_matrix = eda_data.isna().astype(int)
corr = missing_matrix.corr().round(2)

# Blank out the strictly upper triangle (k=1 keeps the diagonal visible).
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
masked_corr = corr.where(~mask)

# Cell annotations: the coefficient as text, empty wherever the value is NaN
# (masked cells and pairs whose correlation is undefined).
text_labels = masked_corr.astype(str).mask(masked_corr.isna(), "")

# plot
heatmap_trace = go.Heatmap(
    z=masked_corr.values,
    x=masked_corr.columns,
    y=masked_corr.index,
    text=text_labels.values,
    texttemplate="%{text}",
    colorscale="Blues",
    colorbar={"title": "Missing Corr"},
    zmin=0,
    zmax=1,
    hoverinfo="skip",
)
fig = go.Figure(data=heatmap_trace)

layout_options = {
    "title": "Clean Triangle Missing Value Correlation Heatmap",
    "xaxis_tickangle": 45,
    "width": 850,
    "height": 600,
    "margin": {"t": 50, "l": 80, "r": 50, "b": 80},
    "font": {"size": 8},
    "plot_bgcolor": "white",
}
fig.update_layout(**layout_options)

# Reverse the y axis so the first column sits at the top.
fig.update_yaxes(autorange="reversed")
fig.show()

# This lower-triangle heatmap shows the correlation between the missing-value
# indicators of each pair of columns in the dataset. Each cell measures how
# strongly missingness in one column tracks missingness in the other, with
# darker blue indicating a stronger relationship. Most of the values are very
# high (close to 1.0), meaning that when one column is missing, others tend to
# be missing in the same rows — especially skill-related fields such as
# SKILLS, SPECIALIZED_SKILLS, and SOFTWARE_SKILLS, which are likely populated
# from the same job-posting metadata.
# This pattern indicates that missingness is structured rather than random —
# possibly because job descriptions are recorded differently across roles or
# industries. For example, a posting with no software-skill tags might also
# lack common skills or NAICS codes, which points to data-entry gaps rather
# than real differences in job content. Recognizing these correlations helps
# when choosing an imputation strategy or deciding whether to drop certain
# rows or columns entirely during preprocessing.
# --- Impute missing values --------------------------------------------------
# Each filled column is assigned back to the frame. Calling
# `fillna(..., inplace=True)` on a column selection is chained assignment: it
# operates on a temporary object, raises a FutureWarning on pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.

# SALARY: fill gaps with the column median.
if "SALARY" in eda_data.columns:
    eda_data["SALARY"] = eda_data["SALARY"].fillna(eda_data["SALARY"].median())
else:
    print("Warning: 'SALARY' column not found in dataframe!")

# COMPANY: fill gaps with an explicit placeholder.
if "COMPANY" in eda_data.columns:
    eda_data["COMPANY"] = eda_data["COMPANY"].fillna("Unknown")
else:
    print("Warning: 'COMPANY' column not found in dataframe!")

# Fill the remaining numeric columns with their mean (SALARY has no gaps left
# at this point, so it is skipped by the null check).
# NOTE(review): NAICS_2022_6 looks like an industry code; if it is parsed as
# numeric, mean-filling yields a value that is not a real code - confirm.
num_cols = eda_data.select_dtypes(include='number').columns
for col in num_cols:
    if eda_data[col].isnull().any():
        eda_data[col] = eda_data[col].fillna(eda_data[col].mean())

# Fill the remaining categorical columns with their most frequent value.
cat_cols = eda_data.select_dtypes(include='object').columns
for col in cat_cols:
    if eda_data[col].isnull().any():
        modes = eda_data[col].mode()
        # mode() is empty when the column holds no non-null values; skip it
        # instead of failing on the `[0]` lookup.
        if not modes.empty:
            eda_data[col] = eda_data[col].fillna(modes.iloc[0])

# --- Drop sparse columns ----------------------------------------------------
# Keep only columns with non-null values in at least half of the rows.
# `thresh` is documented as an int, so use ceil(len / 2), which selects the
# same columns as the float `len * 0.5`.
# NOTE(review): this runs after imputation, so it can only remove columns
# that are still mostly NaN at this point (e.g. a column with no values at
# all). Move it above the imputation if mostly-missing columns should be
# dropped rather than filled.
min_non_null = (len(eda_data) + 1) // 2
eda_data = eda_data.dropna(thresh=min_non_null, axis=1)

# delete duplicates
eda_data = eda_data.drop_duplicates(
    subset=["TITLE", "COMPANY", "LOCATION", "POSTED", "BODY"]
)

# Truncate posting bodies to 1000 characters and store both text columns as
# plain strings.
eda_data['BODY'] = eda_data['BODY'].str.slice(0, 1000).astype(str)
eda_data['COMPANY'] = eda_data['COMPANY'].astype(str)

# Re-import kept from the original export cell; redundant with the import at
# the top of the file.
import pandas as pd
# Write the cleaned frame to a gzip-compressed Parquet file.
#
# NOTE: engine='pyarrow' requires the optional `pyarrow` package. The recorded
# run of this cell (Python 3.13 virtual environment) failed with:
#   ModuleNotFoundError: No module named 'pyarrow'
#   ImportError: Missing optional dependency 'pyarrow'. pyarrow is required
#   for parquet support. Use pip or conda to install pyarrow.
# Install pyarrow in the project environment before re-running this step.
eda_data.to_parquet('data/eda.parquet', engine='pyarrow', compression='gzip')